Here we use data from Media Cloud containing only the titles of news items. Therefore the program is simpler than the one presented in the course.
# Load data with the function fread (fast) and the encoding UTF-8
df1 <- fread("data/source/en_GBR_guardi_2019_2020.csv", encoding = "UTF-8")
df1$media <- "en_GBR_guardi"
df2 <- fread("data/source/en_GBR_telegr_2019_2020.csv", encoding = "UTF-8")
df2$media <- "en_GBR_telegr"
# Stack the two media and drop the intermediate tables
df <- rbind(df1, df2)
rm(df1, df2)
# Select and rename the columns of interest
df$id <- df$stories_id
df$who <- df$media
df$when <- as.Date(df$publish_date)
df$text <- df$title
df <- df[, c("id", "who", "when", "text")]
df <- df[order(when), ]
# Select the period of interest
# (when is already a Date, so no reconversion is needed here)
mintime <- as.Date("2019-01-01")
maxtime <- as.Date("2020-12-31")
df <- df[!is.na(when), ]
df <- df[when >= mintime & when <= maxtime, ]
# Eliminate duplicated titles
df <- df[!duplicated(text), ]
We transform the previous data.frame into data.table format for easier aggregation operations.
# Convert the filtered data.frame to a data.table and derive calendar fields
dt <- as.data.table(df)
dt[, day := as.Date(when)]
dt[, week := cut(day, "weeks", start.on.monday = TRUE)]
dt[, month := cut(day, "months")]
dt[, weekday := weekdays(day)]
# Save the enriched table
saveRDS(dt, "data/corpus/dt_mycorpus.RDS")
We examine whether the distribution by week is regular across the different media of the corpus.
# Reload the saved corpus table
dt<-readRDS("data/corpus/dt_mycorpus.RDS")
# Count news items per week and per media source
news_weeks<-dt[,.(newstot=.N),by=.(week,who)]
# Line plot of the weekly counts with a loess smooth, one colour per media
p<-ggplot(news_weeks, aes(x=as.Date(week),y=newstot, col=who))+
geom_line()+
geom_smooth(method = 'loess', formula = 'y~x')+
scale_y_continuous("Number of news", limits = c(0,NA)) +
scale_x_date("Week (starting on monday)") +
ggtitle(label ="Corpus : distribution of news by week")
p
We examine whether the distribution is regular by weekday and check in particular the effect of the weekend.
# Compute frequencies by weekday for each media
news_weekdays<-dt[,.(newstot=.N),by=.(weekday,who)]
news_weekdays<-news_weekdays[,.(newspct=100*newstot/sum(newstot)),by=.(who,weekday)]
# Translate weekdays into English with a sortable numeric prefix.
# NOTE(review): weekdays() output is locale-dependent; the mapping below assumes
# a French locale (confirmed by "Mardi" in the console output), where the
# alphabetical factor levels are Dimanche, Jeudi, Lundi, Mardi, Mercredi,
# Samedi, Vendredi. BUG FIX: Jeudi is Thursday and Mercredi is Wednesday
# (the former labels swapped them), and "Sathurday" is corrected to "Saturday".
news_weekdays$weekday<-as.factor(news_weekdays$weekday)
levels(news_weekdays$weekday)<-c("7.Sunday","4.Thursday","1.Monday","2.Tuesday","3.Wednesday","6.Saturday","5.Friday")
# Re-factor so levels are sorted by the numeric prefix (Monday first)
news_weekdays$weekday<-as.factor(as.character(news_weekdays$weekday))
news_weekdays<-news_weekdays[order(news_weekdays$weekday),]
# Dodged bar chart of the share of news by weekday and media
p<-ggplot(news_weekdays, aes(x=weekday,fill = who, y=newspct))+
geom_bar(position = "dodge", stat="identity")+
scale_y_continuous("Share of news (%)", limits = c(0,NA)) +
ggtitle(label ="Corpus : distribution of news by week day")
p
# Transform the data.table into a quanteda corpus
qd<-corpus(dt,docid_field = "id",text_field = "text")
# Count tokens per title and keep only titles with 3 to 99 tokens
qd$nbt<-ntoken(texts(qd))
qd<-corpus_subset(qd, nbt > 2 & nbt < 100)
# Save corpus in qd format
saveRDS(qd,"data/corpus/qd_mycorpus.RDS")
head(qd)
Corpus consisting of 6 documents and 7 docvars.
1128777462 :
"Apprehension on all sides before launch of Irish abortion se..."
1128777446 :
"Toxic legacy taints ANC as it nears 25-year rule in South Af..."
1128795170 :
"Preparations for the Harbin ice and snow festival – in pictu..."
1128795156 :
"'Resign from Facebook': experts offer Mark Zuckerberg advice..."
1128795141 :
"Gender pay gap: 2018 brought transparency – will 2019 bring ..."
1128795146 :
"It's a thong thing: why £16 flip-flops are the shoe of summe..."
summary(qd,3)
Corpus consisting of 103386 documents, showing 3 documents:
Text Types Tokens Sentences who when day week month
1128777462 10 10 1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
1128777446 12 12 1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
1128795170 11 11 1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
weekday nbt
Mardi 10
Mardi 12
Mardi 11
#' @title create an hypercube
#' @name hypercube
#' @description aggregate a tagged corpus into a data.table ("hypercube") with
#'   one row per combination of sentence order, source, time unit, topic and
#'   pair of states, counting tags and weighted news
#' @param corpus a corpus of news in quanteda format
#' @param order name of the docvar giving the order of sentences in the news
#' @param who name of the docvar giving the source dimension
#' @param when name of the docvar giving the time dimension
#' @param timespan aggregation of time ("day", "week", "month", ...) passed to cut()
#' @param what name of the docvar holding a space-separated list of topics
#' @param where1 name of the docvar holding a space-separated list of states
#' @param where2 name of the docvar holding a space-separated list of states
#' @return a data.table with columns order, who, when, where1, where2, what,
#'   tags (tag count) and news (sum of news weights)
hypercube <- function( corpus = qd,
order = "order",
who = "source",
when = "when",
timespan = "week",
what = "what",
where1 = "where1",
where2 = "where2")
{
# prepare data : pull the requested docvars into a data.table
don<-docvars(corpus)
df<-data.table(id = docid(corpus),
order = don[[order]],
who = don[[who]],
when = don[[when]],
what = don[[what]],
where1 = don[[where1]],
where2 = don[[where2]])
# adjust id : make sentence ids unique by appending the order
df$id<-paste(df$id,"_",df$order,sep="")
# change time span : recode dates into the chosen period (weeks start on Monday)
df$when<-as.character(cut(as.Date(df$when), timespan, start.on.monday = TRUE))
# unnest where1 : one row per state tag ("_no_" marks untagged sentences)
df$where1[df$where1==""]<-"_no_"
df<-unnest_tokens(df,where1,where1,to_lower=F)
# unnest where2
df$where2[df$where2==""]<-"_no_"
df<-unnest_tokens(df,where2,where2,to_lower=F)
# unnest what
df$what[df$what==""]<-"_no_"
df<-unnest_tokens(df,what,what,to_lower=F)
# Compute weight of news : each sentence is split over its tag combinations,
# so each resulting row gets 1/(number of rows of the sentence) and the
# weights of one sentence sum to 1
newswgt<-df[,list(wgt=1/.N),list(id)]
df <- merge(df,newswgt, by="id")
# ------------------------ Hypercube creation --------------------#
# Aggregate : count tags and sum news weights per cell
hc<- df[,.(tags = .N, news=sum(wgt)) ,.(order,who, when,where1,where2, what)]
# Convert date to time
hc$when<-as.Date(hc$when)
# export
return(hc)
}
# Load the tagged corpus (states + topic) and set a constant sentence order
qd<-readRDS("data/corpus/qd_mycorpus_states_topic.RDS")
qd$order<-1
# Build the hypercube at daily resolution for the pandemic topic ("pand")
hc<-hypercube( corpus = qd,
order = "order",
who = "who",
when = "when",
timespan = "day",
what = "pand",
where1 = "states",
where2 = "states")
# Save the hypercube and report the size of the resulting file
saveRDS(hc,"data/corpus/hc_mycorpus_states_pand.RDS")
paste("Size of resulting file = ",round(file.size("data/corpus/hc_mycorpus_states_pand.RDS")/1000000,3), "Mo")
[1] "Size of resulting file = 0.133 Mo"
#### ---------------- testchi2 ----------------
#' @title Compute the average salience of the topic and test significance of deviation
#' @name testchi2
#' @description add to the table the estimated proportion (estimate), its ratio
#'   to the expected proportion (salience), and a one-sided chi-square test
#'   (chi2, p.value) of success/trial against null.value
#' @param tabtest a table with variables trial, success and null.value
#' @param minsamp Threshold of sample size requested for salience computation
#' @param mintest Threshold of expected count requested for chi-square test
#' @return the input table with columns estimate, salience, chi2, p.value added
#'   (NA where the thresholds are not met)
testchi2<-function(tabtest=tabtest,
minsamp = 20,
mintest = 5)
{
tab<-tabtest
n<-dim(tab)[1]
# Initialize the result columns
tab$estimate <-NA
tab$salience <-NA
tab$chi2<-NA
tab$p.value<-NA
# Compute salience row by row when the sample size is sufficient (default : N > 20).
# BUG FIX: the former code used `if (tab$trial > minsamp)`, a vector condition
# inside `if`, which silently checked only the first row (and is an error in
# R >= 4.2). The condition is now applied to every row.
ok <- !is.na(tab$trial) & tab$trial > minsamp
tab$estimate[ok] <- round(tab$success[ok] / tab$trial[ok], 5)
tab$salience <- tab$estimate / tab$null.value
# Chi-square test when the expected count is sufficient (default : Nij* >= 5)
for (i in seq_len(n)) {
if (!is.na(tab$trial[i]) && tab$trial[i] * tab$null.value[i] >= mintest) {
test <- prop.test(x = tab$success[i], n = tab$trial[i], p = tab$null.value[i],
alternative = "greater")
tab$chi2[i] <- round(test$statistic, 2)
tab$p.value[i] <- round(test$p.value, 5)
}
}
return(tab)
}
# Reload the daily hypercube of the pandemic topic
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")
### ---------------- what ----------------
#' @title Compute the average salience of the topic
#' @name what
#' @description build a summary table of the topic share and a pie chart of it
#' @param hc an hypercube prepared as data.table
#' @param subtop a subtag of the main tag (default = NA)
#' @param title Title of the graphic
what <- function (hc = hypercube,
subtop = NA,
title = "What ?")
{
res <- hc
# Flag rows belonging to the topic: any tag when subtop is NA,
# otherwise the requested subtag only
if (is.na(subtop)) {
res$what <- res$what != "_no_"
} else {
res$what <- res$what == subtop
}
# Sum news weights by topic flag and derive percentages
res <- res[, list(news = sum(news)), by = what]
res$pct <- 100 * res$news / sum(res$news)
# Pie chart of the topic share
p <- plot_ly(res,
labels = ~what,
values = ~pct,
type = 'pie') %>%
layout(title = title,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
return(list("table" = res, "plotly" = p))
}
# Share of pandemic news over the whole corpus (table + pie chart)
res_what <- what(hc = hc,
subtop = NA,
title = "Topic news")
res_what$table
res_what$plotly
The function who.what explores the variation of interest in the topic across the different media of the corpus.
#### ---------------- who.what ----------------
#' @title visualize variation of the topic between media
#' @name who.what
#' @description create a table of variation of the topic by media
#' @param hc an hypercube prepared as data.table
#' @param test visualize test (TRUE) or salience (FALSE)
#' @param minsamp Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic
#' @return a list with the aggregated table ("table") and the plotly chart ("plotly")
who.what <- function (hc = hypercube,
test = FALSE,
minsamp = 20,
mintest = 5,
title = "Who says What ?")
{
tab<-hc
# Flag topic rows, then count trials (all news) and successes (topic news) by media
{tab$what <-tab$what !="_no_"}
tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(who)]
# Expected share of topic news over the whole corpus (the null hypothesis)
ref <-round(sum(tab$success)/sum(tab$trial),4)
tab$null.value<-ref
# Add estimate, salience and chi-square test columns
tab<-testchi2(tabtest=tab,
minsamp = minsamp,
mintest = mintest)
# Colour either by salience (sequential palette) or by p.value (diverging palette)
if (test==FALSE) {tab$index =tab$salience
tab<-tab[tab$trial > minsamp,]
mycol<-brewer.pal(7,"YlOrRd")
}
else {tab$index=tab$p.value
tab<-tab[tab$trial*tab$null.value>mintest,]
mycol<-brewer.pal(7,"RdYlBu")
mycol[4]<-"lightyellow"
}
# Bar chart of the observed share of topic news by media, with hover details
p <- plot_ly(tab,
x = ~who,
y = ~estimate*100,
color= ~index,
colors= mycol,
hoverinfo = "text",
text = ~paste('Source: ',who,
'<br /> Total news : ', round(trial,0),
'<br /> Topic news : ', round(success,0),
'<br /> % observed : ', round(estimate*100,2),'%',
'<br /> % estimated : ', round(null.value*100,2),'%',
'<br /> Salience : ', round(salience,2),
'<br /> p.value : ', round(p.value,4)),
type = "bar") %>%
layout(title = title,
yaxis = list(title = "% news"),
barmode = 'stack')
output<-list("table" = tab, "plotly" =p)
return(output)
}
# Significance of the topic by media (low thresholds given the corpus size)
res_who_what<- who.what(hc=hc,
test = TRUE,
minsamp = 5,
mintest = 1,
title = "Topic news by media - Significance")
res_who_what$plotly
NA
#### ---------------- when.what ----------------
#' @title visualize variation of the topic through time
#' @name when.what
#' @description create a table of variation of the topic by time unit
#' @param hc an hypercube prepared as data.table
#' @param test visualize test (TRUE) or salience (FALSE)
#' @param minsamp Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic
#' @return a list with the aggregated table ("table") and the plotly chart ("plotly")
when.what <- function (hc = hypercube,
test = FALSE,
minsamp = 20,
mintest = 5,
title = "Who says What ?")
{
tab<-hc
# Flag topic rows, then count trials (all news) and successes (topic news) by time unit
{tab$what <-tab$what !="_no_"}
tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(when)]
# Expected share of topic news over the whole period (the null hypothesis)
ref <-round(sum(tab$success)/sum(tab$trial),4)
tab$null.value<-ref
# Add estimate, salience and chi-square test columns
tab<-testchi2(tabtest=tab,
minsamp = minsamp,
mintest = mintest)
# Colour either by salience (sequential palette) or by p.value (diverging palette)
if (test==FALSE) {tab$index =tab$salience
tab<-tab[tab$trial > minsamp,]
mycol<-brewer.pal(7,"YlOrRd")
}
else {tab$index=tab$p.value
tab<-tab[tab$trial*tab$null.value>mintest,]
mycol<-brewer.pal(7,"RdYlBu")
mycol[4]<-"lightyellow"
}
# Bar chart of the observed share of topic news through time, with hover details
p <- plot_ly(tab,
x = ~as.character(when),
y = ~estimate*100,
color= ~index,
colors= mycol,
hoverinfo = "text",
text = ~paste('Time: ',when,
'<br /> Total news : ', round(trial,0),
'<br /> Topic news : ', round(success,0),
'<br /> % observed : ', round(estimate*100,2),'%',
'<br /> % estimated : ', round(null.value*100,2),'%',
'<br /> Salience : ', round(salience,2),
'<br /> p.value : ', round(p.value,4)),
type = "bar") %>%
layout(title = title,
yaxis = list(title = "% news"),
barmode = 'stack')
output<-list("table" = tab, "plotly" =p)
return(output)
}
# Aggregate the time dimension by month and test the monthly variations
hc2 <- hc %>% mutate(when = cut(when,breaks="month"))
res_when_what<- when.what(hc=hc2,
test=TRUE,
minsamp=10,
mintest=5,
title = "Topic news by month - Significance")
res_when_what$plotly
# Restrict to 2020 and aggregate the time dimension by week
hc2 <- hc %>% filter(substr(when,1,4)=="2020") %>% mutate(when = cut(when,breaks="week"))
res_when_what<- when.what(hc=hc2,
test=TRUE,
minsamp=10,
mintest=5,
title = "Topic news by week - Significance")
res_when_what$plotly
# Keep the daily resolution but restrict to the first quarter of 2020
hc2 <- hc %>% filter(when > as.Date("2020-01-01"), when < as.Date("2020-04-01"))
res_when_what<- when.what(hc=hc2,
test=TRUE,
minsamp=10,
mintest=5,
title = "Topic news by day - Significance")
res_when_what$plotly
#### ---------------- where.what ----------------
#' @title visualize spatialization of the topic
#' @name where.what
#' @description create a map of variation of the topic by state
#' @param hc an hypercube prepared as data.table
#' @param test visualize test (TRUE) or salience (FALSE)
#' @param minsamp Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param map a map with coordinates in lat-long
#' @param proj a projection accepted by plotly
#' @param title Title of the graphic
#' @return a list with the aggregated table ("table") and the plotly map ("plotly")
where.what <- function (hc = hypercube,
test = FALSE,
minsamp = 20,
mintest = 5,
map = world_ctr,
proj = 'azimuthal equal area',
title = "Where said What ?")
{
tab<-hc
# Flag topic rows, then count trials (all news) and successes (topic news) by state
tab$what <-tab$what !="_no_"
tab<-tab[,list(trial = round(sum(news),0),success=round(sum(news*what),0)),by = list(where1)]
# Expected share of topic news over all states (the null hypothesis)
ref <-round(sum(tab$success)/sum(tab$trial),4)
tab$null.value<-ref
# Add estimate, salience and chi-square test columns, sorted by decreasing chi2
tab<-testchi2(tabtest=tab,
minsamp = minsamp,
mintest = mintest)
tab<-tab[order(-chi2),]
# Colour either by salience (sequential palette) or by p.value (diverging palette)
if (test==FALSE) {tab$index =tab$salience
tab<-tab[tab$trial > minsamp,]
mycol<-brewer.pal(7,"YlOrRd")
}
else {tab$index=tab$p.value
tab<-tab[tab$trial*tab$null.value>mintest,]
mycol<-brewer.pal(7,"RdYlBu")
mycol[4]<-"lightyellow"
}
# Join the results to the map; where1 is expected to hold ISO3 codes
map<-merge(map,tab,all.x=T,all.y=F,by.x="ISO3",by.y="where1")
#map2<-map[is.na(map$pct)==F,]
#map2<-st_centroid(map2)
#map2<-st_drop_geometry(map2)
# Base-map layout options for plotly
g <- list(showframe = TRUE,
framecolor= toRGB("gray20"),
coastlinecolor = toRGB("gray20"),
showland = TRUE,
landcolor = toRGB("gray50"),
showcountries = TRUE,
countrycolor = toRGB("white"),
countrywidth = 0.2,
projection = list(type = proj))
# Proportional symbols sized by topic news, coloured by the chosen index
p<- plot_geo(map)%>%
add_markers(x = ~lon,
y = ~lat,
sizes = c(0, 250),
size = ~success,
# color= ~signif,
color = ~index,
colors= mycol,
hoverinfo = "text",
text = ~paste('Location: ',NAME,
'<br /> Total news : ', round(trial,0),
'<br /> Topic news : ', round(success,0),
'<br /> % observed : ', round(estimate*100,2),'%',
'<br /> % estimated : ', round(null.value*100,2),'%',
'<br /> Salience : ', round(salience,2),
'<br /> p.value : ', round(p.value,4))) %>%
layout(geo = g,
title = title)
output<-list("table" = tab, "plotly" =p)
return(output)
}
# Load a world map of state centroids (lat-long, EPSG:4326)
map<-readRDS("data/dico_states/world_ctr_4326.Rdata")
# Keep only news locating two tagged foreign states (exclude "_no_" and GBR itself)
hc2<-hc %>% filter(where1 !="_no_", where2 !="_no_") %>% filter(where1 !="GBR", where2 !="GBR")
res_where_what<- where.what(hc=hc2,
test=TRUE,
minsamp=10,
map = map,
mintest =2,
title = "Topic news by states - Significance")
res_where_what$plotly
#' @title filter an hypercube
#' @name hc_filter
#' @description select rows of an hypercube by time period, source and locations
#' @param don an hypercube prepared as data.table
#' @param who name of the column holding the source dimension
#' @param when name of the column holding the time dimension
#' @param where1 name of the column holding the first location
#' @param where2 name of the column holding the second location
#' @param wgt name of the column used as weight
#' @param self keep internal links where1 == where2 (default FALSE)
#' @param when_start earliest date kept (NA = no lower bound)
#' @param when_end latest date kept (NA = no upper bound)
#' @param who_exc sources to exclude (NA = none)
#' @param who_inc sources to keep (NA = all)
#' @param where1_exc locations to exclude from where1 (NA = none)
#' @param where1_inc locations to keep in where1 (NA = all)
#' @param where2_exc locations to exclude from where2 (NA = none)
#' @param where2_inc locations to keep in where2 (NA = all)
#' @return a data.table with columns who, when, where1, where2, wgt
hc_filter <- function(don = hc,
who = "who",
when = "when",
where1 = "where1",
where2 = "where2",
wgt = "tags",
self = FALSE,
when_start = NA,
when_end = NA,
who_exc = NA,
who_inc = NA,
where1_exc = NA,
where1_inc = NA,
where2_exc = NA,
where2_inc = NA)
{
# BUG FIX: the former tests `is.na(x)==FALSE` fail inside `if` when an
# exclusion/inclusion argument is a vector of length > 1 (is.na() is
# vectorized and `if` requires a length-one condition; error in R >= 4.2).
# A filter is considered given when it carries at least one non-NA value.
given <- function(x) !all(is.na(x))
df <- data.table(who = don[[who]],
when = don[[when]],
where1 = don[[where1]],
where2 = don[[where2]],
wgt = don[[wgt]])
# Select time period
if (given(when_start)) {
df <- df[when >= as.Date(when_start), ]}
if (given(when_end)) {
df <- df[when <= as.Date(when_end), ]}
# Select who
if (given(who_exc)) {
df <- df[!(who %in% who_exc), ]}
if (given(who_inc)) {
df <- df[(who %in% who_inc), ]}
# Select where1
if (given(where1_exc)) {
df <- df[!(where1 %in% where1_exc), ]}
if (given(where1_inc)) {
df <- df[(where1 %in% where1_inc), ]}
# Select where2
if (given(where2_exc)) {
df <- df[!(where2 %in% where2_exc), ]}
if (given(where2_inc)) {
df <- df[(where2 %in% where2_inc), ]}
# Eliminate internal links
if (self == FALSE) {
df <- df[(where1 != where2), ]}
return(df)
}
#' @title build an interaction table
#' @name build_int
#' @description aggregate flows into an i x j matrix, filter weak rows/columns,
#'   and return the result as a long table of links
#' @param don a dataframe with origin, destination and flow columns
#' @param i name of the origin column
#' @param j name of the destination column
#' @param Fij name of the flow column
#' @param s1 minimum total flow of a row (origin) to be kept
#' @param s2 minimum total flow of a column (destination) to be kept
#' @param n1 minimum number of links of size >= k for a row to be kept
#' @param n2 minimum number of links of size >= k for a column to be kept
#' @param k threshold defining a link for the n1/n2 criteria
#' @return a data.frame of links with columns i, j, Fij
build_int <- function(don = don, # a dataframe with columns i, j , Fij
i = "where1",
j = "where2",
Fij = "wgt",
s1 = 10,
s2 = 10,
n1 = 2,
n2 = 2,
k = 1)
{
df <- data.table(i = don[[i]], j = don[[j]], Fij = don[[Fij]])
# Aggregate flows and cast to a full i x j matrix (missing pairs = 0)
int <- df[, .(Fij = sum(Fij)), .(i, j)]
int <- dcast(int, formula = i ~ j, fill = 0)
mat <- as.matrix(int[, -1])
row.names(mat) <- int$i
# Keep rows/columns whose total flow reaches s1/s2.
# rowSums/colSums replace the slower apply(..., sum); drop = FALSE keeps the
# matrix shape when a single row or column survives (the former code silently
# collapsed it to a vector, breaking the melt below).
mat <- mat[rowSums(mat) >= s1, colSums(mat) >= s2, drop = FALSE]
# Keep rows/columns with at least n1/n2 links of size >= k
m0 <- mat
m0[m0 < k] <- 0
m0[m0 >= k] <- 1
mat <- mat[rowSums(m0) >= n1, colSums(m0) >= n2, drop = FALSE]
# Back to a long table of links
int <- reshape2::melt(mat)
names(int) <- c("i", "j", "Fij")
return(int)
}
#' @title model an interaction table
#' @name rand_int
#' @description fit a Poisson independence model Fij ~ i + j and (optionally)
#'   add estimates and residuals to the interaction table
#' @param int A table with columns i, j, Fij
#' @param maxsize maximum number of rows for which the model is computed
#' @param diag keep the diagonal (i == j) of the matrix (default FALSE)
#' @param resid add estimates and residuals to the table (default FALSE)
#' @return the interaction table with i, j coerced to character and, when
#'   resid = TRUE, columns Eij, Rabs_ij, Rrel_ij, Rchi_ij added
rand_int <- function(int = int, # A table with columns i, j Fij
maxsize = 100000,
diag = FALSE,
resid = FALSE) {
# Eliminate diagonal ?
if (diag == FALSE) {
int <- int[as.character(int$i) != as.character(int$j), ]}
# Compute model if size not too large
if (dim(int)[1] < maxsize) {
# Proceed to poisson regression model
mod <- glm(formula = Fij ~ i + j, family = "poisson", data = int)
# Add residuals if requested
if (resid == TRUE) {
# Add estimates
int$Eij <- mod$fitted.values
# Add absolute residuals
int$Rabs_ij <- int$Fij - int$Eij
# Add relative residuals
int$Rrel_ij <- int$Fij / int$Eij
# Add chi-square residuals, signed by the sign of the absolute residual
int$Rchi_ij <- (int$Rabs_ij)**2 / int$Eij
int$Rchi_ij[int$Rabs_ij < 0] <- -int$Rchi_ij[int$Rabs_ij < 0]
}
} else {
# BUG FIX: the former code built this message with paste() and discarded the
# result, so the user was never told the model was skipped; it also
# hard-coded "100000" instead of the maxsize parameter.
warning("Table has ", dim(int)[1], " rows (>= maxsize = ", maxsize,
") - increase the maxsize parameter if you are sure your computer can do it!",
call. = FALSE)
}
# Export results
int$i <- as.character(int$i)
int$j <- as.character(int$j)
return(int)
}
#' @title visualize an interaction network
#' @name geo_network
#' @description build an interactive visNetwork graph from a table of links
#' @param don a dataframe of links with origin, destination, size and test columns
#' @param from name of the origin column
#' @param to name of the destination column
#' @param size name of the column used for edge width and node size
#' @param minsize minimum size of the links kept (NA = no minimum)
#' @param maxsize maximum size of the links kept (NA = no maximum)
#' @param test name of the column used to filter links by significance
#' @param mintest minimum value of the test column for a link to be kept
#' @param loops keep loops (default FALSE)
#' @param title Title of the graphic
#' @return a visNetwork htmlwidget
geo_network<- function(don = don,
from = "i",
to = "j",
size = "Fij",
minsize = 1,
maxsize = NA,
test = "Fij",
mintest = 1,
loops = FALSE,
title = "Network")
{
# Build the edge list and apply the size and significance filters
int<-data.frame(i = as.character(don[,from]),
j = as.character(don[,to]),
size = don[,size],
test = don[,test]
)
if (is.na(minsize)==FALSE) {int =int[int$size >= minsize,]}
if (is.na(maxsize)==FALSE) {int =int[int$size <= maxsize,]}
if (is.na(mintest)==FALSE) {int =int[int$test >= mintest,]}
# Build the node list; nodes that appear as a destination are coloured red
nodes<-data.frame(code = unique(c(int$i,int$j)))
nodes$code<-as.character(nodes$code)
nodes$id<-1:length(nodes$code)
nodes$label<-nodes$code
nodes$color <-"gray"
nodes$color[nodes$code %in% int$j]<-"red"
# Adjust edge codes : map code names to numeric node ids and scale widths
edges <- int %>% mutate(width = 5+5*size / max(size)) %>%
left_join(nodes %>% select(i=code, from = id)) %>%
left_join(nodes %>% select(j=code, to = id ))
# compute nodesize : total flow of each node as origin and as destination
toti<-int %>% group_by(i) %>% summarize(size =sum(size)) %>% select (code=i,size)
totj<-int %>% group_by(j) %>% summarize(size =sum(size)) %>% select (code=j,size)
tot<-rbind(toti,totj)
tot<-unique(tot)
tot$code<-as.factor(tot$code)
nodes <- left_join(nodes,tot) %>% mutate(value = 1 +5*sqrt(size/max(size)))
#sel_nodes <-nodes %>% filter(code %in% unique(c(sel_edges$i,sel_edges$j)))
# eliminate loops
# NOTE(review): `from < to` removes loops but also drops one direction of
# every reciprocal pair of links — confirm this is intended
if(loops == FALSE) {edges <- edges[edges$from < edges$to,]}
# Assemble the interactive network with scaled nodes/edges and a force layout
net<- visNetwork(nodes,
edges,
main = title,
height = "1000px",
width = "70%") %>%
visNodes(scaling =list(min =20, max=60,
label=list(min=20,max=80,
maxVisible = 20)))%>%
visEdges(scaling = list(min=20,max=60))%>%
visOptions(highlightNearest = TRUE,
# selectedBy = "group",
# manipulation = TRUE,
nodesIdSelection = TRUE) %>%
visInteraction(navigationButtons = TRUE) %>%
visLegend() %>%
visIgraphLayout(layout ="layout.fruchterman.reingold",smooth = TRUE)
net
return(net)
}
# Load complete hypercube
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")
# Keep only news with both locations tagged
hc <- hc %>% filter(where1 !="_no_", where2 != "_no_")
# Eliminate non foreign news : drop links involving the country of the media
# (the ISO3 code is characters 4-6 of the media name, e.g. "GBR")
hc<-hc[hc$where1 != substr(who,4,6),]
hc<-hc[hc$where2 != substr(who,4,6),]
# Add complete labels from the world map attribute table (ISO3 code + name)
map<-readRDS("data/dico_states/world_map_4326.Rdata")
labs<-st_drop_geometry(map)
labs<-labs[,c(1,4)]
# Shorten the name of USA
labs$NAME[labs$ISO3=="USA"]<-"U.S.A."
# Join readable state names for where1
names(labs)<-c("where1","geofr1")
hc<-left_join(hc,labs)
Joining, by = "where1"
# Join readable state names for where2
names(labs)<-c("where2","geofr2")
hc<-left_join(hc,labs)
Joining, by = "where2"
# Keep a copy of the labelled hypercube
hc_geo_geo <- hc
hc<-hc_geo_geo
# Filter : use the readable names as locations, drop untagged and internal links
hc<-hc_filter(don = hc,
wgt = "news",
where1 = "geofr1",
where2 = "geofr2",
where1_exc = c("_no_"),
where2_exc = c("_no_"),
self = FALSE
)
# Build the interaction table with low thresholds (small corpus)
int <- build_int(don = hc,
s1=2,
s2=2,
n1=1,
n2=1,
k=0)
Using 'Fij' as value column. Use 'value.var' to override
# Fit the Poisson independence model and keep the residuals
mod<-rand_int(int,
resid = TRUE,
diag = FALSE)
# Draw the network of significant links (chi-square residual above 3.84, i.e. p < 0.05)
network<- geo_network(mod,
size = "Fij",
minsize = 1,
test = "Rchi_ij",
mintest = 3.84)
Joining, by = "i"
Joining, by = "j"
`summarise()` ungrouping output (override with `.groups` argument)
`summarise()` ungrouping output (override with `.groups` argument)
Joining, by = "code"
network
NA